#DOJScraper.py
import urllib
import re 
import os
import csv
import numpy
from threading import Thread


# Number of worker threads started for each download phase
nthreads = 10

# Download the archive index and extract the path of every monthly listing
linksite = urllib.urlopen('https://www.justice.gov/archive/justice-news-archive.html').read()
months = sorted(re.findall('<a href="/archive/opa/pr/(.+?)"', linksite, re.DOTALL))

# Months whose listing page produced no links; they are retried later
faillist = []
# Accumulates one [month, url] pair per press-release link found
urllist = []


print("Collecting Links by Month")
# Loop over months, finding links 
def collectlinks(monthlist):
    """Collect press-release links from each monthly listing page.

    For every entry in ``monthlist`` the page
    ``https://www.justice.gov/archive/opa/pr/<month>`` is downloaded and
    scanned for links.  Each link found is appended to the module-level
    ``urllist`` as a ``[month, url]`` pair.  A month that yields at least
    one link is appended to ``complete``; a month that yields none, or
    whose page cannot be downloaded, is appended to ``faillist``.

    Args:
        monthlist: iterable of month path strings, relative to
            ``/archive/opa/pr/``.
    """
    base = "https://www.justice.gov/archive/opa/pr/"

    for month in monthlist:
        print(month)
        url = base + month

        try:
            site = urllib.urlopen(url).read()
        except IOError:
            # Record the month as failed instead of letting the exception
            # end this worker and drop the rest of its months.
            print("Could not fetch page " + url)
            faillist.append(month)
            continue

        # Links that carry the full /archive/opa/pr/ prefix
        links = re.findall('<a href="/archive/opa/pr/(.+?)"', site, re.DOTALL)
        urllist.extend([[month, base + link] for link in links])

        # For older pages, just a list of links without the /opa/pr/ pattern.
        # links2 is reset for every month so that a result left over from an
        # earlier month can never make an empty page look successful.
        links2 = []
        site = site.lower()
        if '<a href ="/">' not in site:
            links2 = re.findall('<a href="(.+?)"', site, re.DOTALL)
            urllist.extend([[month, url + "/" + link] for link in links2])

        if links or links2:
            complete.append(month)
        else:
            print("No Links at page " + url)
            faillist.append(month)



# Split the months into one chunk per worker thread
monthlists = numpy.array_split(months,nthreads)
threads = []
complete = []

for chunk in monthlists:
    t = Thread(target=collectlinks, args=(chunk.tolist(),))
    t.start()
    threads.append(t)

for t in threads:
    t.join()

print("Retrying %d Failures" %len(faillist))
# Retry in a bounded number of rounds.  Each round works on a snapshot of
# faillist and clears it first: collectlinks appends new failures to
# faillist, so passing faillist itself would grow the list being iterated
# and never terminate for a month that keeps failing.
max_retry_rounds = 3
for _ in range(max_retry_rounds):
    if not faillist:
        break
    pending = list(faillist)
    del faillist[:]
    collectlinks(pending)

complete.sort()
print("Completed " + str(len(complete)) + " Out of " + str(len(months)) + " Months")
# Raise explicitly rather than using an assert statement, which is skipped
# when Python runs with -O.
if complete != months:
    missing = sorted(set(months) - set(complete))
    raise AssertionError("Months not completed: " + ", ".join(missing))
print("All Months Complete")


PRList = []

print("Scraping " + str(len(urllist)) + " Links")

## Looping over links is slow so we multithread

def scrape(pagelist):
    """Download each press release and store its cleaned text.

    Every page in ``pagelist`` is fetched, stripped of HTML tags and
    flattened to a single line with commas removed.  The result is appended
    to the module-level ``PRList`` as a ``[month, text]`` pair.  A page
    that cannot be downloaded is reported and skipped.

    Args:
        pagelist: sequence of ``[month, url]`` pairs.
    """
    for page in pagelist:
        month = page[0]
        url = page[1]

        try:
            text = urllib.urlopen(url).read()
        except IOError:
            # Skip only this page; an uncaught error would end the worker
            # and lose every remaining page in its chunk.
            print("Could not fetch page " + url)
            continue

        # Strip tags.  '.' does not match a newline, so only tags that sit
        # on a single line are removed; one pass is sufficient.
        text = re.sub("<.+?>", '', text)

        # Flatten line breaks and tabs, and drop commas from the text
        for char in ("\n", "\r", "\t", ","):
            text = text.replace(char, " ")

        # Replace the full entity first so its trailing ';' is not left
        # behind, then the unterminated form.
        text = text.replace("&nbsp;", " ")
        text = text.replace("&nbsp", " ")

        # Bring together multiple whitespace
        text = re.sub("( )+", " ", text)

        PRList.append([month, text])
        # Progress indicator: number of pages stored so far
        print(len(PRList))


# Start from an empty result list, then split the links into one chunk per
# worker thread
PRList = []
urllists = numpy.array_split(urllist,nthreads)
threads = []
for chunk in urllists:
    t = Thread(target=scrape, args=(chunk.tolist(),))
    t.start()
    threads.append(t)

# Wait for every worker before writing the results
for t in threads:
    t.join()


# The Python 2 csv module expects a file opened in binary mode; in text mode
# Windows turns the writer's "\r\n" row terminator into "\r\r\n".
# NOTE: under Python 3 this would instead be open(..., "w", newline="").
with open("archivePRList.csv","wb") as f:
    wr = csv.writer(f)
    wr.writerows(PRList)




